***********************************************************
*** CLEAN DATASET FOR 50, 150, 100-MILE RADIUS ANALYSIS ***
*** Last edited: 3/13/2022         		   	   	  ***
***********************************************************
// Close any log left open by an earlier run that aborted before reaching -log close-;
// without this, -log using- stops with "log file already open" (r(604)).
// -capture- swallows the error raised when no log is open.
capture log close
// Start a text log for this run; the file name embeds today's date via $S_DATE.
log using "${CodePath}/log/clean_comparisongroups_$S_DATE.log", text replace



*--------------------------------------------------
* 100 MILE RADIUS
*--------------------------------------------------

// Build the hospital-pair dataset used by the 100- and 50-mile analyses
// (input files are available in the replication packet).

// Step 1: read the hospital-hospital distance matrix and stack it into one row per (id, id2) pair.
import delimited "${DataPath}/shapefiles/hospcoords_within100m_distmatrix_clean.csv", clear
rename v* dist*
reshape long dist, i(id) j(id2)
rename dist dist_m
// the conversions below treat dist_m as metres
generate dist_km = dist_m/1000
generate dist_mile = 0.000621371 * dist_m
keep dist_km dist_mile id id2
rename (id id2) (pn_1 pn_2)

// Step 2: attach city and state from pos2010_clean.dta to each side of the pair.
// keep(1 3) retains pairs whose hospital is not found in the lookup (their city/state stay missing).
foreach side in 1 2 {
	preserve
		use pn city state using "${DataPath}/pos2010_clean.dta", clear
		rename (pn city state) (pn_`side' city_`side' state_`side')
		tempfile lookup
		save `lookup'
	restore
	merge m:1 pn_`side' using `lookup', keep(1 3) nogenerate
}

// Step 3: attach the state-segment id (split_id) to each side of the pair, same merge rules as above.
foreach side in 1 2 {
	preserve
		import delimited "${DataPath}/shapefiles/hospcoord_to_splitatstate.csv", clear
		keep pn split_id
		rename (pn split_id) (pn_`side' split_id_`side')
		tempfile lookup
		save `lookup'
	restore
	merge m:1 pn_`side' using `lookup', keep(1 3) nogenerate
}

// Map each hospital's state to a RAC region code (1-4), once for each side of the pair.
// States not listed below, and hospitals with no state, are left with a missing region.
// The lists are split across several -inlist()- calls because -inlist()- accepts at most
// 10 arguments when they are strings.
// NOTE(review): "DC" is assigned to region 3 here -- confirm against the CMS RAC region definitions.
foreach side in 1 2 {
	generate RACregion_`side' = 1 if inlist(state_`side', "ME", "VT", "NH", "MA", "CT", "RI", "NJ", "NY", "PA")
	replace  RACregion_`side' = 1 if inlist(state_`side', "MD", "DE")
	replace  RACregion_`side' = 2 if inlist(state_`side', "MN", "WI", "MI", "IL", "IN", "OH", "KY")
	replace  RACregion_`side' = 3 if inlist(state_`side', "WV", "VA", "NC", "SC", "TN", "GA", "AL", "MS", "FL")
	replace  RACregion_`side' = 3 if inlist(state_`side', "LA", "AR", "OK", "TX", "NM", "CO", "DC", "PR")
	replace  RACregion_`side' = 4 if inlist(state_`side', "ND", "SD", "IA", "MO", "KS", "NE", "MT", "WY", "ID")
	replace  RACregion_`side' = 4 if inlist(state_`side', "UT", "AZ", "NV", "WA", "OR", "CA", "AK", "HI")
}

// conditions for being in the same comparison group
//
// A pair with a missing RAC region or a missing segment id never qualifies. In Stata a
// missing value compares unequal to every non-missing value and equal to another missing
// value, so bare != / == tests would classify such pairs as "different RAC" (one region
// missing) or "same segment" (both segment ids missing). The !missing() guards prevent that.

// 1. not same RAC (both regions must be known)
gen diffRAC = (RACregion_1 != RACregion_2) & !missing(RACregion_1, RACregion_2)

// 2. within 100 (and 50) miles; a missing distance fails the <= test because
//    numeric missing is larger than any number
gen within100m = (dist_mile <= 100)
gen within50m = (dist_mile <= 50)

// 3. same state segment (both segment ids must be known)
gen samesegment = (split_id_1 == split_id_2) & !missing(split_id_1, split_id_2)

// all 3
gen ingroup_100 = (diffRAC & within100m & samesegment)

gen ingroup_50 = (diffRAC & within50m & samesegment)

// same-RAC counterpart of ingroup_100; tested on the regions directly because
// diffRAC == 0 would also be true for pairs whose region is unknown
gen sameRACgroup_100 = (RACregion_1 == RACregion_2 & !missing(RACregion_1, RACregion_2) & within100m & samesegment)


// per hospital pn_1: number of qualifying comparison hospitals at each radius
bys pn_1: egen tot_comparehosp_100 = total(ingroup_100)
bys pn_1: egen tot_comparehosp_50 = total(ingroup_50)

// tag one row per pn_1 so the summary below counts each hospital once
egen tag = tag(pn_1)
sum tot_comparehosp_100 if tag, detail
// number of hospitals that are being compared against each hospital
// (output below was pasted from an earlier run, before the missing-value guards were added)
/*                        
                       tot_comparehosp
-------------------------------------------------------------
      Percentiles      Smallest
 1%            0              0
 5%            0              0
10%            0              0       Obs                 788
25%            0              0       Sum of Wgt.         788

50%            2                      Mean           6.751269
                        Largest       Std. Dev.      8.973719
75%           11             39
90%           20             39       Variance       80.52763
95%           26             40       Skewness       1.550096
99%           38             41       Kurtosis       4.844133

 */
save "${DataPath}/within100m/pn_distmatrix.dta", replace

/* // calculate jackknife -- audit rate of all hospitals within 100m on the same RAC
use "${DataPath}/within100m/pn_distmatrix.dta", clear
// hospitals within 100 miles with the same RAC as me
bys pn_1: egen tot_inRACgroup_100 = total(sameRACgroup_100)
sum tot_inRACgroup_100 if tag, detail

preserve
	use  "${OutputPath}/Audit Claims Merge/predictionmodels2/model_allyears_5.3/predictions/byprovider_audit_in2011_3y.dta", clear
	rename provider pn_2
	gen nonnum = real(pn_2) == .
	tab nonnum
	drop if nonnum
	destring pn_2, replace
	rename mean_audit_in2011 pn_2_audit_in2011
	tempfile temp
	save `temp'
restore
merge m:1 pn_2 using `temp', keep(1 3)
drop nonnum mean_pred_m53_audit_in2011
drop _merge

keep if sameRACgroup_100 == 1 & pn_1 != pn_2
gcollapse (mean) jk_RAC100_audit_in2011 = pn_2_audit_in2011, by(pn_1)
rename pn_1 pn
save "${DataPath}/within100m/jk_RAC100_audit_in2011.dta", replace */


preserve
	// Diagnostic only: compare to the average number of hospitals per segment in the FE model.
	// Nothing is saved here; -restore- brings back the data that was in memory.
	use "${DataPath}/derived/hospyear_0716_jk.dta", clear
	keep if dist_to_border <= 100
	// count distinct hospitals (one tagged row per pn) within each segment
	egen hosptag = tag(pn)
	bysort nearseg100_state_2: egen tot_hosp = total(hosptag)
	// summarize with one row per segment so each segment is counted once
	egen seg_tag = tag(nearseg100_state_2)
	summarize tot_hosp if seg_tag, detail
restore

// Build the 100-mile comparison groups: one group per centre hospital pn_1.
use "${DataPath}/within100m/pn_distmatrix.dta", clear
// keep qualifying pairs plus any self-pair, so a centre hospital is listed in its own group
keep if pn_1 == pn_2 | ingroup_100 == 1
sort pn_1 pn_2

// Label every row of a group with the list of its member ids as returned by -levelsof-.
generate strL hospcomp_group_100 = ""

levelsof pn_1, local(centers)
foreach center of local centers {
	display "`center'"
	quietly levelsof pn_2 if pn_1 == `center', local(members)
	replace hospcomp_group_100 = "`members'" if pn_1 == `center'
}

// group size: number of rows (non-missing pn_2) for each centre hospital
bysort pn_1: egen n_hospcomp_ingroup_100 = count(pn_2)

// Switch to the member's point of view: one row per (hospital, 100-mile group it belongs to),
// so a hospital appears once for every group it is part of.
keep pn_2 hospcomp_group_100 n_hospcomp_ingroup_100 split_id_2 state_2
rename (pn_2 split_id_2 state_2) (pn split_id state)
bysort pn: egen n_group_partof_100 = count(hospcomp_group_100)

// distribution of the number of groups per hospital (one row per hospital)
egen first_pn = tag(pn)
summarize n_group_partof_100 if first_pn, detail

// distribution of group sizes (one row per distinct member list)
egen first_group = tag(hospcomp_group_100)
summarize n_hospcomp_ingroup_100 if first_group, detail

drop first_pn first_group
save "${DataPath}/within100m/pnlist_groups_100m.dta", replace



*--------------------------------------------------
* 50 MILE RADIUS
*--------------------------------------------------
// Build the 50-mile comparison groups from the pair file saved for the 100-mile analysis
// (it already carries the ingroup_50 flag): one group per centre hospital pn_1.
use "${DataPath}/within100m/pn_distmatrix.dta", clear
// keep qualifying pairs plus any self-pair, so a centre hospital is listed in its own group
keep if pn_1 == pn_2 | ingroup_50 == 1
sort pn_1 pn_2

// Label every row of a group with the list of its member ids as returned by -levelsof-.
generate strL hospcomp_group_50 = ""

levelsof pn_1, local(centers)
foreach center of local centers {
	display "`center'"
	quietly levelsof pn_2 if pn_1 == `center', local(members)
	replace hospcomp_group_50 = "`members'" if pn_1 == `center'
}

// group size: number of rows (non-missing pn_2) for each centre hospital
bysort pn_1: egen n_hospcomp_ingroup_50 = count(pn_2)

// Switch to the member's point of view: one row per (hospital, 50-mile group it belongs to),
// so a hospital appears once for every group it is part of.
keep pn_2 hospcomp_group_50 n_hospcomp_ingroup_50 split_id_2 state_2
rename (pn_2 split_id_2 state_2) (pn split_id state)
bysort pn: egen n_group_partof_50 = count(hospcomp_group_50)

// distribution of the number of groups per hospital (one row per hospital)
egen first_pn = tag(pn)
summarize n_group_partof_50 if first_pn, detail

// distribution of group sizes (one row per distinct member list)
egen first_group = tag(hospcomp_group_50)
summarize n_hospcomp_ingroup_50 if first_group, detail

drop first_pn first_group
save "${DataPath}/within50m/pnlist_groups_50m.dta", replace



*--------------------------------------------------
* 150 MILE RADIUS
*--------------------------------------------------
// Build the hospital-pair dataset for the 150-mile analysis.

// Step 1: read the hospital-hospital distances. This file is read with one row per
// (inputid, targetid) pair, so no -reshape long- is applied (unlike the 100-mile matrix).
import delimited "${DataPath}/shapefiles/hospcoord_within_150m_distmatrix.csv", clear
rename (inputid targetid dist) (pn_1 pn_2 dist_m)
// the conversions below treat dist_m as metres
generate dist_km = dist_m/1000
generate dist_mile = 0.000621371 * dist_m
keep dist_km dist_mile pn_1 pn_2

// Step 2: attach city and state from pos2010_clean.dta to each side of the pair.
// keep(1 3) retains pairs whose hospital is not found in the lookup (their city/state stay missing).
foreach side in 1 2 {
	preserve
		use pn city state using "${DataPath}/pos2010_clean.dta", clear
		rename (pn city state) (pn_`side' city_`side' state_`side')
		tempfile lookup
		save `lookup'
	restore
	merge m:1 pn_`side' using `lookup', keep(1 3) nogenerate
}

// Step 3: attach the state-segment id (split_id) to each side of the pair, same merge rules as above.
foreach side in 1 2 {
	preserve
		import delimited "${DataPath}/shapefiles/hospcoord_to_splitatstate.csv", clear
		keep pn split_id
		rename (pn split_id) (pn_`side' split_id_`side')
		tempfile lookup
		save `lookup'
	restore
	merge m:1 pn_`side' using `lookup', keep(1 3) nogenerate
}

// Map each hospital's state to a RAC region code (1-4), once for each side of the pair.
// States not listed below, and hospitals with no state, are left with a missing region.
// The lists are split across several -inlist()- calls because -inlist()- accepts at most
// 10 arguments when they are strings.
// NOTE(review): "DC" is assigned to region 3 here -- confirm against the CMS RAC region definitions.
foreach side in 1 2 {
	generate RACregion_`side' = 1 if inlist(state_`side', "ME", "VT", "NH", "MA", "CT", "RI", "NJ", "NY", "PA")
	replace  RACregion_`side' = 1 if inlist(state_`side', "MD", "DE")
	replace  RACregion_`side' = 2 if inlist(state_`side', "MN", "WI", "MI", "IL", "IN", "OH", "KY")
	replace  RACregion_`side' = 3 if inlist(state_`side', "WV", "VA", "NC", "SC", "TN", "GA", "AL", "MS", "FL")
	replace  RACregion_`side' = 3 if inlist(state_`side', "LA", "AR", "OK", "TX", "NM", "CO", "DC", "PR")
	replace  RACregion_`side' = 4 if inlist(state_`side', "ND", "SD", "IA", "MO", "KS", "NE", "MT", "WY", "ID")
	replace  RACregion_`side' = 4 if inlist(state_`side', "UT", "AZ", "NV", "WA", "OR", "CA", "AK", "HI")
}

// conditions for being in the same comparison group
//
// A pair with a missing RAC region or a missing segment id never qualifies. In Stata a
// missing value compares unequal to every non-missing value and equal to another missing
// value, so bare != / == tests would classify such pairs as "different RAC" (one region
// missing) or "same segment" (both segment ids missing). The !missing() guards prevent that.

// 1. not same RAC (both regions must be known)
gen diffRAC = (RACregion_1 != RACregion_2) & !missing(RACregion_1, RACregion_2)

// 2. within 150 miles; a missing distance fails the <= test because
//    numeric missing is larger than any number
gen within150m = (dist_mile <= 150)

// 3. same state segment (both segment ids must be known)
gen samesegment = (split_id_1 == split_id_2) & !missing(split_id_1, split_id_2)

// all 3
gen ingroup_150 = (diffRAC & within150m & samesegment)

// same-RAC counterpart of ingroup_150; tested on the regions directly because
// diffRAC == 0 would also be true for pairs whose region is unknown
gen sameRACgroup_150 = (RACregion_1 == RACregion_2 & !missing(RACregion_1, RACregion_2) & within150m & samesegment)


// per hospital pn_1: number of qualifying comparison hospitals within 150 miles
bys pn_1: egen tot_comparehosp_150 = total(ingroup_150)

// tag one row per pn_1 so the summary counts each hospital once
egen tag = tag(pn_1)
sum tot_comparehosp_150 if tag, detail

save "${DataPath}/within150m/pn_distmatrix.dta", replace

// Build the 150-mile comparison groups: one group per centre hospital pn_1.
use "${DataPath}/within150m/pn_distmatrix.dta", clear
// keep qualifying pairs plus any self-pair, so a centre hospital is listed in its own group
keep if pn_1 == pn_2 | ingroup_150 == 1
sort pn_1 pn_2

// Label every row of a group with the list of its member ids as returned by -levelsof-.
generate strL hospcomp_group_150 = ""

levelsof pn_1, local(centers)
foreach center of local centers {
	display "`center'"
	quietly levelsof pn_2 if pn_1 == `center', local(members)
	replace hospcomp_group_150 = "`members'" if pn_1 == `center'
}

// group size: number of rows (non-missing pn_2) for each centre hospital
bysort pn_1: egen n_hospcomp_ingroup_150 = count(pn_2)

// Switch to the member's point of view: one row per (hospital, 150-mile group it belongs to),
// so a hospital appears once for every group it is part of.
keep pn_2 hospcomp_group_150 n_hospcomp_ingroup_150 split_id_2 state_2
rename (pn_2 split_id_2 state_2) (pn split_id state)
bysort pn: egen n_group_partof_150 = count(hospcomp_group_150)

// distribution of the number of groups per hospital (one row per hospital)
egen first_pn = tag(pn)
summarize n_group_partof_150 if first_pn, detail

// distribution of group sizes (one row per distinct member list)
egen first_group = tag(hospcomp_group_150)
summarize n_hospcomp_ingroup_150 if first_group, detail

drop first_pn first_group
save "${DataPath}/within150m/pnlist_groups_150m.dta", replace



// Close the log file opened by -log using- at the top of this do-file.
log close

